载入数据集,并查看数据集的相关信息

# Load the red-wine quality data from a CSV file (path is relative to the
# working directory). The data is read from a plain file rather than taken
# from a documented package object, so a `?wineQualityReds` help lookup finds
# nothing and is not performed here.
pf <- read.csv("wineQualityReds.csv")
# Show each column's name, type and first few values.
str(pf)
## 'data.frame':    1599 obs. of  13 variables:
##  $ X                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
# Per-column summary (min, quartiles, mean, max) of the whole data frame.
summary(pf)
##        X          fixed.acidity   volatile.acidity  citric.acid   
##  Min.   :   1.0   Min.   : 4.60   Min.   :0.1200   Min.   :0.000  
##  1st Qu.: 400.5   1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090  
##  Median : 800.0   Median : 7.90   Median :0.5200   Median :0.260  
##  Mean   : 800.0   Mean   : 8.32   Mean   :0.5278   Mean   :0.271  
##  3rd Qu.:1199.5   3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420  
##  Max.   :1599.0   Max.   :15.90   Max.   :1.5800   Max.   :1.000  
##  residual.sugar     chlorides       free.sulfur.dioxide
##  Min.   : 0.900   Min.   :0.01200   Min.   : 1.00      
##  1st Qu.: 1.900   1st Qu.:0.07000   1st Qu.: 7.00      
##  Median : 2.200   Median :0.07900   Median :14.00      
##  Mean   : 2.539   Mean   :0.08747   Mean   :15.87      
##  3rd Qu.: 2.600   3rd Qu.:0.09000   3rd Qu.:21.00      
##  Max.   :15.500   Max.   :0.61100   Max.   :72.00      
##  total.sulfur.dioxide    density             pH          sulphates     
##  Min.   :  6.00       Min.   :0.9901   Min.   :2.740   Min.   :0.3300  
##  1st Qu.: 22.00       1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500  
##  Median : 38.00       Median :0.9968   Median :3.310   Median :0.6200  
##  Mean   : 46.47       Mean   :0.9967   Mean   :3.311   Mean   :0.6581  
##  3rd Qu.: 62.00       3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300  
##  Max.   :289.00       Max.   :1.0037   Max.   :4.010   Max.   :2.0000  
##     alcohol         quality     
##  Min.   : 8.40   Min.   :3.000  
##  1st Qu.: 9.50   1st Qu.:5.000  
##  Median :10.20   Median :6.000  
##  Mean   :10.42   Mean   :5.636  
##  3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :14.90   Max.   :8.000

该数据集共有13个变量,1599条记录。其中X变量和quality变量是整型,其余11个变量均为数值型。residual.sugar变量中最大值是15.5,而其第三四分位数仅为2.6,可以看做是异常值。同样可看做异常值的还有chlorides、free.sulfur.dioxide、total.sulfur.dioxide、density、sulphates、alcohol和quality变量的最大值。

该数据集中不同质量的酒的分布情况

library(ggplot2)
# Count how many wines received each quality score.
table(pf$quality)
## 
##   3   4   5   6   7   8 
##  10  53 681 638 199  18
# quality only takes integer scores, so draw one bar per score instead of a
# default 30-bin histogram (which makes stat_bin() ask for a better binwidth).
ggplot(aes(quality), data = pf) +
  geom_bar()

评分为5分和6分的酒的数量明显多于其他评分的酒

酒的质量与其他参数的关系

质量与三种酸度(fixed.acidity、volatile.acidity、citric.acid)的关系

library(gridExtra)
# One scatter plot per acidity measure against quality. The low alpha lets
# heavily overplotted regions show up as darker areas.
p1_1 <- ggplot(pf, aes(x = quality, y = fixed.acidity)) +
  geom_point(colour = "red", alpha = 0.1)
p1_2 <- ggplot(pf, aes(x = quality, y = volatile.acidity)) +
  geom_point(colour = "blue", alpha = 0.1)
p1_3 <- ggplot(pf, aes(x = quality, y = citric.acid)) +
  geom_point(colour = "yellow", alpha = 0.1)
# Lay the three panels out side by side.
grid.arrange(p1_1, p1_2, p1_3, ncol = 3)

三种酸度与酒的质量并没有明显的线性关系。在不同质量的酒中,三种酸度的数值跨度都很大。但也可以看出,在质量评分为5和6分的酒中,fixed.acidity的数值集中在 4-12 之间,volatile.acidity的数值集中在 0.2-0.8 之间,citric.acid的数值集中在 0.00-0.625 之间。

质量与residual.sugar的关系

# Scatter plot of residual sugar against quality (semi-transparent points).
ggplot(pf, aes(x = quality, y = residual.sugar)) +
  geom_point(alpha = 0.1)

从图上看,residual.sugar的数值大部分在1-4之间。重点查看一下评分5和6的酒的residual.sugar数值。

# Five-number summary and mean of residual sugar for wines scored 5.
summary(pf$residual.sugar[pf$quality %in% 5])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.200   1.900   2.200   2.529   2.600  15.500
# Five-number summary and mean of residual sugar for wines scored 6.
summary(pf$residual.sugar[pf$quality %in% 6])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.477   2.500  15.400
# Box plots of residual sugar for wines scored 5 (red) and 6 (blue).
# The y axis is zoomed to 1-4 and each panel's mean is marked with a cross.
p2_1 <- ggplot(subset(pf, quality == 5),
               aes(x = quality, y = residual.sugar)) +
  geom_boxplot(colour = "red") +
  coord_cartesian(ylim = c(1, 4)) +
  stat_summary(fun.y = mean, geom = "point", shape = 4)
p2_2 <- ggplot(subset(pf, quality == 6),
               aes(x = quality, y = residual.sugar)) +
  geom_boxplot(colour = "blue") +
  coord_cartesian(ylim = c(1, 4)) +
  stat_summary(fun.y = mean, geom = "point", shape = 4)
# Place the two panels next to each other.
grid.arrange(p2_1, p2_2, ncol = 2)

可以看出,评分5和6的酒的residual.sugar的数值大部分在1-3.5之间,均值分别为2.529和2.477,第一四分位值和中位数完全一致。

# Residual sugar by quality score, with the y axis zoomed to the 5th-95th
# percentile of residual.sugar and ticks every 0.2. Overlaid summary lines per
# score: the mean (default colour), the median (solid blue) and the 10% / 90%
# quantiles (dashed blue).
ggplot(aes(quality, residual.sugar), data = pf) +
  coord_cartesian(ylim = c(quantile(pf$residual.sugar, .05),
                           quantile(pf$residual.sugar, .95))) +
  scale_y_continuous(breaks = seq(quantile(pf$residual.sugar, .05),
                                  quantile(pf$residual.sugar, .95),
                                  .2)) +
  geom_point(alpha = 0.1, color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  # quantile()'s argument is `probs`; it is spelled out in full rather than
  # relying on partial matching of `prob`.
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')

不同质量的酒的residual.sugar数值,中位数和均值的变化比较平缓,但是90%分位数的变化较大。可能是因为除了评分5和6的酒,其他分值的酒样本较少,90%分位数受极值影响较大。

质量与chlorides的关系

library(reshape2)
# Summarise chlorides separately for wines scored 5 and 6. dcast() spreads the
# values into one column per score (keyed by the row id X), so each score
# column is NA on the rows that belong to the other score.
summary(
  dcast(
    pf[pf$quality %in% c(5, 6), ],
    X ~ quality,
    value.var = "chlorides"
  )
)
##        X                5                6         
##  Min.   :   1.0   Min.   :0.0390   Min.   :0.0340  
##  1st Qu.: 382.5   1st Qu.:0.0740   1st Qu.:0.0682  
##  Median : 768.0   Median :0.0810   Median :0.0780  
##  Mean   : 793.0   Mean   :0.0927   Mean   :0.0850  
##  3rd Qu.:1219.5   3rd Qu.:0.0940   3rd Qu.:0.0880  
##  Max.   :1599.0   Max.   :0.6110   Max.   :0.4150  
##                   NA's   :638      NA's   :681
# Box plots of chlorides for wines scored 5 and 6, one facet per score.
# quality is numeric, so the grouping is stated explicitly; without it
# geom_boxplot() warns "Continuous x aesthetic -- did you forget
# aes(group=...)?".
ggplot(aes(quality, chlorides, group = quality),
       data = subset(pf, quality==5 | quality==6)) +
  facet_wrap(~quality) +
  geom_boxplot() +
  # Zoom the y axis to 0.03-0.15.
  coord_cartesian(ylim = c(0.03, 0.15)) +
  # Mark each score's mean with a cross.
  stat_summary(fun.y = mean, geom = 'point', shape = 4)

可以看出,评分5和6的酒的chlorides的数值大部分在0.04-0.12之间,均值分别为0.0927和0.0850,中位数分别为0.0810和0.0780。

质量与free.sulfur.dioxide、total.sulfur.dioxide、sulphates的关系

# Scatter plots of free sulfur dioxide, total sulfur dioxide and sulphates
# against quality, shown side by side.
p3_1 <- ggplot(pf, aes(x = quality, y = free.sulfur.dioxide)) +
  geom_point(colour = "red", alpha = 0.1)
p3_2 <- ggplot(pf, aes(x = quality, y = total.sulfur.dioxide)) +
  geom_point(colour = "blue", alpha = 0.1)
p3_3 <- ggplot(pf, aes(x = quality, y = sulphates)) +
  geom_point(colour = "orange", alpha = 0.1)
grid.arrange(p3_1, p3_2, p3_3, ncol = 3)

重点查看评分为5和6的酒的free.sulfur.dioxide数据

# Summarise free.sulfur.dioxide separately for wines scored 5 and 6. dcast()
# produces one column per score (keyed by the row id X); each score column is
# NA on the rows that belong to the other score.
summary(
  dcast(
    pf[pf$quality %in% c(5, 6), ],
    X ~ quality,
    value.var = "free.sulfur.dioxide"
  )
)
##        X                5               6        
##  Min.   :   1.0   Min.   : 3.00   Min.   : 1.00  
##  1st Qu.: 382.5   1st Qu.: 9.00   1st Qu.: 8.00  
##  Median : 768.0   Median :15.00   Median :14.00  
##  Mean   : 793.0   Mean   :16.98   Mean   :15.71  
##  3rd Qu.:1219.5   3rd Qu.:23.00   3rd Qu.:21.00  
##  Max.   :1599.0   Max.   :68.00   Max.   :72.00  
##                   NA's   :638     NA's   :681
# Box plots of free.sulfur.dioxide for wines scored 5 and 6, one facet per
# score. quality is numeric, so the grouping is stated explicitly; without it
# geom_boxplot() warns "Continuous x aesthetic -- did you forget
# aes(group=...)?".
ggplot(aes(quality, free.sulfur.dioxide, group = quality),
       data = subset(pf, quality==5 | quality==6)) +
  facet_wrap(~quality) +
  geom_boxplot() +
  # Zoom the y axis to 0-50.
  coord_cartesian(ylim = c(0, 50)) +
  # Mark each score's mean with a cross.
  stat_summary(fun.y = mean, geom = 'point', shape = 4)

可以看出,评分5和6的酒的free.sulfur.dioxide数据大部分在1-40之间,均值分别为16.98和15.71,中位数分别为15和14。

质量与density、pH、alcohol的关系

# Scatter plots of density, pH and alcohol against quality, side by side.
p4_1 <- ggplot(pf, aes(x = quality, y = density)) +
  geom_point(colour = "red", alpha = 0.1)
p4_2 <- ggplot(pf, aes(x = quality, y = pH)) +
  geom_point(colour = "blue", alpha = 0.1)
p4_3 <- ggplot(pf, aes(x = quality, y = alcohol)) +
  geom_point(colour = "orange", alpha = 0.1)
grid.arrange(p4_1, p4_2, p4_3, ncol = 3)

评分5和6的酒,density和pH的数值分布较为接近。评分5的酒的alcohol值在9-11.5之间,评分6的酒的alcohol值在9-13之间。

三种酸度之间的关系

fixed.acidity和volatile.acidity的关系

# Fixed vs. volatile acidity, with a fitted linear trend drawn in orange.
ggplot(pf, aes(x = fixed.acidity, y = volatile.acidity)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", colour = "orange")

# Correlation test between the two acidity measures.
with(pf, cor.test(fixed.acidity, volatile.acidity))
## 
##  Pearson's product-moment correlation
## 
## data:  fixed.acidity and volatile.acidity
## t = -10.589, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3013681 -0.2097433
## sample estimates:
##        cor 
## -0.2561309

fixed.acidity和volatile.acidity之间存在较弱的负相关性

fixed.acidity和citric.acid的关系

# Fixed acidity vs. citric acid, with a fitted linear trend drawn in orange.
ggplot(pf, aes(x = fixed.acidity, y = citric.acid)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", colour = "orange")

# Correlation test between fixed acidity and citric acid.
with(pf, cor.test(fixed.acidity, citric.acid))
## 
##  Pearson's product-moment correlation
## 
## data:  fixed.acidity and citric.acid
## t = 36.234, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6438839 0.6977493
## sample estimates:
##       cor 
## 0.6717034

fixed.acidity和citric.acid之间存在较强的正相关性

volatile.acidity和citric.acid的关系

# Volatile acidity vs. citric acid with a fitted linear trend; the y axis is
# zoomed to 0-1.
ggplot(pf, aes(x = volatile.acidity, y = citric.acid)) +
  geom_point(alpha = 0.4) +
  coord_cartesian(ylim = c(0, 1)) +
  geom_smooth(method = "lm", colour = "orange")

# Correlation test between volatile acidity and citric acid.
with(pf, cor.test(volatile.acidity, citric.acid))
## 
##  Pearson's product-moment correlation
## 
## data:  volatile.acidity and citric.acid
## t = -26.489, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5856550 -0.5174902
## sample estimates:
##        cor 
## -0.5524957

volatile.acidity和citric.acid之间存在较强的负相关性

free.sulfur.dioxide、total.sulfur.dioxide和sulphates之间的关系

free.sulfur.dioxide和total.sulfur.dioxide的关系

# Free vs. total sulfur dioxide as a semi-transparent scatter plot.
ggplot(pf, aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  geom_point(alpha = 0.2)

# Correlation test between free and total sulfur dioxide.
with(pf, cor.test(free.sulfur.dioxide, total.sulfur.dioxide))
## 
##  Pearson's product-moment correlation
## 
## data:  free.sulfur.dioxide and total.sulfur.dioxide
## t = 35.84, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6395786 0.6939740
## sample estimates:
##       cor 
## 0.6676665

free.sulfur.dioxide和total.sulfur.dioxide之间存在较强的正相关性

free.sulfur.dioxide、total.sulfur.dioxide和sulphates之间的关系

# Mean free / total sulfur dioxide per sulphates bin. sulphates is rounded to
# the nearest multiple of 0.25 so that each bin pools several wines.
p5_1 <- ggplot(
  pf,
  aes(x = 0.25 * round(sulphates / 0.25), y = free.sulfur.dioxide)
) +
  geom_line(stat = "summary", fun.y = mean, colour = "red")
p5_2 <- ggplot(
  pf,
  aes(x = 0.25 * round(sulphates / 0.25), y = total.sulfur.dioxide)
) +
  geom_line(stat = "summary", fun.y = mean, colour = "blue")
grid.arrange(p5_1, p5_2, ncol = 2)

# Correlation test between free sulfur dioxide and (unbinned) sulphates.
with(pf, cor.test(free.sulfur.dioxide, sulphates))
## 
##  Pearson's product-moment correlation
## 
## data:  free.sulfur.dioxide and sulphates
## t = 2.0671, df = 1597, p-value = 0.03888
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.002643125 0.100424406
## sample estimates:
##        cor 
## 0.05165757
# Correlation test between total sulfur dioxide and sulphates.
with(pf, cor.test(total.sulfur.dioxide, sulphates))
## 
##  Pearson's product-moment correlation
## 
## data:  total.sulfur.dioxide and sulphates
## t = 1.7178, df = 1597, p-value = 0.08602
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.006087119  0.091774762
## sample estimates:
##        cor 
## 0.04294684

从相关系数上看,free.sulfur.dioxide、total.sulfur.dioxide和sulphates之间并无相关性。在增加x轴(sulphates)的组距至0.25后发现,free.sulfur.dioxide和total.sulfur.dioxide在组距0.25时的均值随sulphates的增加而增加。

alcohol、residual.sugar和density的关系

# Density against alcohol: raw points in orange, plus the mean (solid line)
# and median (dashed line) of density at each alcohol value.
ggplot(pf, aes(x = alcohol, y = density)) +
  geom_point(colour = "orange", alpha = 0.25) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = median, linetype = 2)

# Correlation test between alcohol and density.
with(pf, cor.test(alcohol, density))
## 
##  Pearson's product-moment correlation
## 
## data:  alcohol and density
## t = -22.838, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5322547 -0.4583061
## sample estimates:
##        cor 
## -0.4961798

alcohol与density之间存在中度负相关性。

# Density against residual sugar, with the x axis zoomed to the 1st-95th
# percentile of residual.sugar. Points are orange; the solid line is the mean
# and the dashed line the median of density at each residual.sugar value.
ggplot(pf, aes(x = residual.sugar, y = density)) +
  coord_cartesian(
    xlim = c(
      quantile(pf$residual.sugar, 0.01),
      quantile(pf$residual.sugar, 0.95)
    )
  ) +
  geom_point(colour = "orange", alpha = 0.25) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = median, linetype = 2)

# Correlation test between residual sugar and density.
with(pf, cor.test(residual.sugar, density))
## 
##  Pearson's product-moment correlation
## 
## data:  residual.sugar and density
## t = 15.189, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3116908 0.3973835
## sample estimates:
##       cor 
## 0.3552834

residual.sugar与density之间存在中度正相关性

pH值与酸度的关系

pH值与fixed.acidity、volatile.acidity、citric.acid的关系

# Scatter plots of the three acidity measures against pH.
p6_1 <- ggplot(pf, aes(x = pH, y = fixed.acidity)) +
  geom_point(colour = "red", alpha = 0.25)
p6_2 <- ggplot(pf, aes(x = pH, y = volatile.acidity)) +
  geom_point(colour = "blue", alpha = 0.25)
p6_3 <- ggplot(pf, aes(x = pH, y = citric.acid)) +
  geom_point(colour = "orange", alpha = 0.25)
# The three panels are arranged on a two-column grid.
grid.arrange(p6_1, p6_2, p6_3, ncol = 2)

# Correlation test between pH and fixed acidity.
with(pf, cor.test(pH, fixed.acidity))
## 
##  Pearson's product-moment correlation
## 
## data:  pH and fixed.acidity
## t = -37.366, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7082857 -0.6559174
## sample estimates:
##        cor 
## -0.6829782
# Correlation test between pH and volatile acidity.
with(pf, cor.test(pH, volatile.acidity))
## 
##  Pearson's product-moment correlation
## 
## data:  pH and volatile.acidity
## t = 9.659, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1880823 0.2807254
## sample estimates:
##       cor 
## 0.2349373
# Correlation test between pH and citric acid.
with(pf, cor.test(pH, citric.acid))
## 
##  Pearson's product-moment correlation
## 
## data:  pH and citric.acid
## t = -25.767, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5756337 -0.5063336
## sample estimates:
##        cor 
## -0.5419041

fixed.acidity、volatile.acidity、citric.acid三项指标中,fixed.acidity与pH值的相关性最大,相关系数为-0.683。

不同密度的酒的pH值与fixed.acidity的关系

# pH against fixed acidity, with each point coloured by the wine's density.
ggplot(pf, aes(x = fixed.acidity, y = pH)) +
  geom_point(aes(colour = density))

# Median pH at each fixed.acidity value, drawn as a line with colour mapped to
# density.
ggplot(pf, aes(x = fixed.acidity, y = pH)) +
  geom_line(aes(colour = density), stat = "summary", fun.y = median)

可以看出,pH值的中位数随fixed.acidity值的增加而降低。密度较小的酒pH值一般较高,密度较大的酒一般pH值较低。

pH值与residual.sugar的关系

# pH against residual sugar, with the x axis zoomed to the 5th-95th percentile
# of residual.sugar.
ggplot(pf, aes(x = residual.sugar, y = pH)) +
  coord_cartesian(
    xlim = c(
      quantile(pf$residual.sugar, 0.05),
      quantile(pf$residual.sugar, 0.95)
    )
  ) +
  geom_point(alpha = 0.5)

# Correlation test between residual sugar and pH.
with(pf, cor.test(residual.sugar, pH))
## 
##  Pearson's product-moment correlation
## 
## data:  residual.sugar and pH
## t = -3.4355, df = 1597, p-value = 0.0006066
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.13411046 -0.03678574
## sample estimates:
##         cor 
## -0.08565242

pH值与residual.sugar几乎没有线性相关性(相关系数仅为-0.086)。

结论

数据集中酒的质量的分布情况

# Final plot 1: histogram of quality scores with one axis tick per score.
ggplot(pf, aes(x = quality)) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = seq(3, 8, by = 1)) +
  labs(
    x = "Quality Score",
    y = "Number of Sample",
    title = "Distribution of Quality Score"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

样本中评分为5和6的酒的数量最多,数量远大于其他分值的酒。

评分为5和6的样本中,residual.sugar数值的分布情况

# Final plot 2: residual sugar box plots for wines scored 5 (red) and 6
# (blue). Per-panel axis titles are blanked because shared titles are supplied
# by grid.arrange() below. The y axis is zoomed to 1-4 with ticks every 0.1,
# and each panel's mean is marked with a cross.
p2_1 <- ggplot(subset(pf, quality == 5),
               aes(x = quality, y = residual.sugar)) +
  geom_boxplot(colour = "red") +
  labs(x = "", y = "") +
  coord_cartesian(ylim = c(1, 4)) +
  scale_y_continuous(breaks = seq(1, 4, by = 0.1)) +
  scale_x_continuous(breaks = seq(4.5, 5.5, by = 0.5)) +
  stat_summary(fun.y = mean, geom = "point", shape = 4)
p2_2 <- ggplot(subset(pf, quality == 6),
               aes(x = quality, y = residual.sugar)) +
  geom_boxplot(colour = "blue") +
  labs(x = "", y = "") +
  coord_cartesian(ylim = c(1, 4)) +
  scale_y_continuous(breaks = seq(1, 4, by = 0.1)) +
  scale_x_continuous(breaks = seq(5.5, 6.5, by = 0.5)) +
  stat_summary(fun.y = mean, geom = "point", shape = 4)
# Combine both panels under shared axis titles and a common heading.
grid.arrange(
  p2_1, p2_2,
  ncol = 2,
  left = "Residual.sugar Value",
  top = "Distribution of Residual.sugar with Score 5 and 6",
  bottom = "Quality Score"
)

residual.sugar的数值大部分在1-3.5之间,均值分别为2.529和2.477,第一四分位值和中位数完全一致。

不同密度的酒的pH值与fixed.acidity的关系

# Final plot 3: median pH at each fixed.acidity value, drawn as a line with
# colour mapped to density; x ticks every 2 units from 6 to 16, title centred.
ggplot(pf, aes(x = fixed.acidity, y = pH)) +
  geom_line(aes(colour = density), stat = "summary", fun.y = median) +
  scale_x_continuous(breaks = seq(6, 16, by = 2)) +
  xlab("Fixed.acidity Value") +
  ylab("pH Value") +
  ggtitle("Relationship of Fixed.acidity and pH with Different Density") +
  theme(plot.title = element_text(hjust = 0.5))

pH值的中位数随fixed.acidity值的增加而降低。密度较小的酒pH值一般较高,密度较大的酒一般pH值较低。

反思

开始时,我认为数据间会存在明显的相关性,但随着分析的深入,发现相关性并没有那么明显。例如,我在分析residual.sugar和pH的关系时,开始时认为residual.sugar的值越小,pH也应该越小,但通过分析发现,二者并没有相关性。 在未来的分析中,应当注意不要在分析数据前就形成先入为主的想法,容易造成方向错误。